/**************************************************************************
* This macro pulls pension records form 5500 data for 2009-2011.
*
* The 2009 format is slightly different and these data are processed
* separately from the 2000-2008 data.
* 
* After the 2010 file year, all forms are electronically processed
* so the cleaning and sampling is no longer done for the research
* files. For these years, I rely only on the research files produced 
* by DoL.
*
/*************************************************************************/

/*2009*/

%macro pullplans_09(yyyy,yy);

	/**************************************************************************
	* Step 1: Input the research file for year=yy
	***************************************************************************/
	data res&yy
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			ack_id
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			large
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			soleplan
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			source_id);
		set red.bul&yy (drop = DB DC);	/*remove pre-coded DB DC flags*/
		where best_for_plan = /*redacted*/;				/*keep best filing only (no duplicates)*/
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;				/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		if /*redacted*/ OR 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/
		
		fpy_begin_yr = /*redacted*/; 	/*plan start year*/
		fpy_begin_mth = /*redacted*/; 	/*plan start month*/
		fpy_begin_day = /*redacted*/;	/*plan start day*/
		fpy_date = mdy(fpy_begin_mth,fpy_begin_day,fpy_begin_yr);		/*plan start sas date*/
		plan_qtr_strt = qtr(fpy_date);						/*plan calendar start qtr*/
		plan_year_strt = year(fpy_date);					/*plan calendar start year*/

		form_tax_prd = left(trim(form_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/

		type_plan_entity_ind = left(trim(type_plan_entity_cd));
		if type_plan_entity_ind^=/*redacted*/ then delete;			/*keep only single employer plans*/

		form_year = 20&yy;						/*form year*/
		filing_id = /*redacted*/;			/*numeric id*/
		filing_id_char = /*redacted*/;				/*string filing id*/
		source_id = /*redacted*/;        						/*id for research file*/
		plan_eff_year = /*redacted*/;			/*year that the plan started*/

		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 					/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;					/*all plans are now pension plans*/		
	run;

	/**************************************************************************
	* Step 2: Identify the set of records that are NOT in the research file 
	* but ARE in the raw file
	***************************************************************************/
	
	/*Input the raw file*/
	data raw&yy;
		set green.f5500fileyears_2000_2014 (drop = filing_id);
		where fileyear=20&yy;
		filing_id = /*redacted*/;				/*numeric id*/
		filing_id_char = /*redacted*/;				/*string filing id*/
	run;
	
	proc sql;
		create table xclud&yy as
		select a.filing_id as res_id, b.*
		from res&yy as a right join
		raw&yy as b
		on a.filing_id = b.filing_id;
	quit;
	
	/*Obs in the raw file that are not in the research file*/
	data xclud&yy;
		set xclud&yy;
		if res_id^=. then delete; /*records not in the research file*/
	run;
	
	/**************************************************************************
	* Step 3: clean the xclud plans file and make variable names consistent
	***************************************************************************/
	
	data xclud&yy
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			ack_id
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			large
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			t					/*this is used to de-duplicate*/
		);												
		set xclud&yy;
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;				/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		plan_name = upcase(plan_name);
		if /*redacted*/ or 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/
	
		plan_qtr_strt = qtr(plan_year_begin_date);				/*plan calendar start qtr*/
		plan_year_strt = year(plan_year_begin_date);				/*plan calendar start year*/

		plan_qtr_end = qtr(plan_year_end_date);					/*plan calendar end qtr*/
		plan_year_end = year(plan_year_end_date);				/*plan calendar end year*/
	
		form_year = fileyear;						/*rename for consistency with research file*/
		adj_active = tot_active_partcp_cnt;				/*no editing for the raw file*/
		large = /*redacted*/; 							
		opr_ein = left(trim(spons_dfe_ein));				/*rename for consistency with research file*/
		opr_pn = left(trim(spons_dfe_pn));				/*rename for consistency with research file*/
		len_pn = length(opr_pn);					/*length of pn field*/
		if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		plan_eff_year = /*redacted*/;		/*year that the plan started*/
	
		type_plan_entity_ind = left(trim(type_plan_entity_cd));
		if type_plan_entity_ind=/*redacted*/ or type_plan_entity_ind=/*redacted*/ then /*take out multi employer plans*/
			not_multiemployer=/*redacted*/;
		else not_multiemployer=/*redacted*/;
		if not_multiemployer=/*redacted*/ then delete;
	
		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 					/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;					/*all plans are now pension plans*/
		t = /*redacted*/;								/*helper used for de-duplication step*/
	run;
	
	/*Set variable lengths to enable error-free appending with the research file*/
	data xclud&yy;
	
		/*redacted*/
		
		set xclud&yy;
	run;
		
	/**************************************************************************
	* Step 4: De-duplicate the file (pick the max filing id for each EIN-PN)
	***************************************************************************/
	
	proc sort data = xclud&yy;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = xclud&yy;
		by opr_ein opr_pn plan_year_end;
		var filing_id;
		output out = best_filing 
		max(filing_id) = max_filing_id;
	run;
	
	proc sql;
		create table xclud&yy._clean as
		select a.*, b.max_filing_id
		from xclud&yy as a inner join
		best_filing as b
		on a.filing_id = b.max_filing_id;
	quit;
	
	data  xclud&yy._clean;
		set xclud&yy._clean (drop= t max_filing_id);
		source_id = /*redacted*/; /*id for raw file*/
	run;
	
	/**************************************************************************
	* Step 5: Stack plans from the two files together
	***************************************************************************/
	
	data stack&yy;
		set res&yy xclud&yy._clean;
	run;
	
	/*
	  De-duplicate in case of repeated entries
	  research file entry gets priority when dups occur
	*/
	
	proc sort data = stack&yy;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = stack&yy;
		by opr_ein opr_pn plan_year_end;
		var source_id;
		output out = deduper
		min(source_id) = min_source_id;
	run;
	
	proc sql;
		create table pens&yy as
		select a.*, b.min_source_id, b.opr_ein, b.opr_pn
		from stack&yy as a left join
		deduper as b
		on a.opr_ein = b.opr_ein & a.opr_pn = b.opr_pn & a.plan_year_end = b.plan_year_end;;
	quit;

	data pens&yy;
		set pens&yy;
		if source_id^=min_source_id then delete;
	run;
	
	/*Remove exact filing_id dups*/
	proc sort data = pens&yy nodupkey;
	      by opr_ein opr_pn plan_year_end filing_id_char;
	run;

	/**************************************************************************
	* Step 6: Count up # DB plans per EIN
	***************************************************************************/
	
	proc sort data = pens&yy;
		by opr_ein;
	run;
	
	proc means noprint data = pens&yy;
		by opr_ein;
		var DB;				
		output out = plan_count&yy
		sum(DB) = num_DB_plans;
	run;
	
	/*Check how many participants are in firms with 2+ DB plans*/
	proc sql;
		create table pens&yy._plncnt as 
		select *
		from pens&yy as t,
		plan_count&yy as m
		where t.opr_ein = m.opr_ein;
	quit;
		
	data blue.pensplan_samp&yy (drop =
		xDB
		min_source_id
		soleplan
		_freq_
		_type_);
		set pens&yy._plncnt;
		adj_DB_partcp = DB*adj_active;
		DB_partcp = DB*tot_active_partcp_cnt;
	run;
	
	/*De-dup small number of cases where filing errors create EIN-PN_Plan year end dups
	  i.e. the filing_id's are different but the plan years are mis-aligned*/
	proc sort data = blue.pensplan_samp&yy nodupkey;
	  by opr_ein opr_pn plan_year_end;
	run;

	proc freq data = blue.pensplan_samp&yy;
	  tables DB*DC;
	run;
	
	/**************************************************************************
	* Step 7: Add data from schedule H (employer and employee contributions)
	***************************************************************************/
	
	/*Input schedule H raw data*/
	PROC IMPORT OUT= WORK.schh&yyyy 
		    DATAFILE= "/.../F5500/f5500/F_SCH_H_&yyyy..csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schh&yyyy  (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schh&yyyy (rename = (emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		participant_contrib_amt=employee_contrib_income_amt_n
		tot_income_amt = tot_income_amt_n));
		filing_id = /*redacted*/;				/*numeric id*/
		filing_id_char = /*redacted*/;				/*string filing id*/
		opr_ein = left(trim(sch_h_ein));
		opr_pn = left(trim(sch_h_pn));
		schh_tax_prd = left(trim(sch_h_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/		
	run;

	/*Input schedule I raw data*/
	PROC IMPORT OUT= WORK.schi&yyyy 
		    DATAFILE= "/.../F5500/f5500/F_SCH_I_&yyyy..csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schi&yyyy (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schi&yyyy (rename = (small_emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		small_participant_contrib_amt=employee_contrib_income_amt_n
		small_tot_income_amt = tot_income_amt_n));
		filing_id = /*redacted*/;				/*numeric id*/
		filing_id_char = /*redacted*/;				/*string filing id*/
		opr_ein = left(trim(sch_i_ein));
		opr_pn = left(trim(sch_i_plan_num));
		schi_tax_prd = left(trim(sch_i_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/		
	run;
	
	/*SF raw data*/
	data sf&yyyy (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
	      set green.f5500fileyears_2000_2014 (rename = (emplr_contrib_income_amt=emplr_contrib_income_amt_n 
	      particip_contrib_income_amt=employee_contrib_income_amt_n
	      tot_income_amt = tot_income_amt_n));
	      where fileyear=&yyyy & sfschedule=1;
	      opr_ein = left(trim(spons_dfe_ein));				/*rename for consistency with research file*/
	      opr_pn = left(trim(spons_dfe_pn));				/*rename for consistency with research file*/
	      len_pn = length(opr_pn);						/*length of pn field*/
	      if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
	      else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
	      /*Redacted: removing invalid EIN/PNs*/
	      filing_id_char = /*redacted*/;	
	      form_tax_prd = left(trim(form_tax_prd)); 
	      ftp_yr = /*redacted*/; 				/*plan end year*/
	      ftp_mth = /*redacted*/; 				/*plan end month*/
	      ftp_day = /*redacted*/;				/*plan end day*/
	      ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
	      plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
	      plan_year_end = year(ftp_date);						/*plan calendar end year*/	      
	run;
	      

	/*Append the files*/
	data blue.schhisf&yyyy;
	      set schh&yyyy schi&yyyy sf&yyyy;
	run;
	
	/*De-duplicate H,I,SF schedules*/
	proc sort data = blue.schhisf&yyyy nodupkey;
	      by filing_id_char;
	run;	
	
	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;


%mend pullplans_09;


/*2010-2011*/

%macro pullplans_1014(yy);

	/**************************************************************************
	* Step 1: Input the research file for year=yy (no sampling post 2010)
	***************************************************************************/
	data pens&yy
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			ack_id
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			large
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			soleplan
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			source_id
			emplr_contrib_income_ame
			participant_contrib_ame
			tot_income_ame
		 rename = (emplr_contrib_income_ame = emplr_contrib_income_amt_n
		           participant_contrib_ame = employee_contrib_income_amt_n
		           tot_income_ame = tot_income_amt_n));
		set red.bul&yy (drop = DB DC);	/*remove pre-coded DB DC flags*/
		where best_for_plan = /*redacted*/;	/*keep best filing only (no duplicates)*/
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;				/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		if /*redacted*/ OR 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/
		
		fpy_begin_yr = /*redacted*/; 	/*plan start year*/
		
		if /*redacted*/ & /*redacted*/ then fpy_begin_yr=/*redacted*/; 	
		else if /*redacted*/ & /*redacted*/ then fpy_begin_yr=/*redacted*/; 
			
		fpy_begin_mth = /*redacted*/; 	/*plan start month*/
		fpy_begin_day = /*redacted*/;	/*plan start day*/
		fpy_date = mdy(fpy_begin_mth,fpy_begin_day,fpy_begin_yr);		/*plan start sas date*/
		plan_qtr_strt = qtr(fpy_date);						/*plan calendar start qtr*/
		plan_year_strt = year(fpy_date);					/*plan calendar start year*/

		form_tax_prd = left(trim(form_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/

		type_plan_entity_ind = left(trim(type_plan_entity_cd));
		if type_plan_entity_ind^="2" then delete;			/*keep only single employer plans*/

		form_year = 20&yy;						/*form year*/
		filing_id = /*redacted*/;			/*numeric id*/
		filing_id_char = /*redacted*/;				/*string filing id*/
		source_id = /*redacted*/;        						/*id for research file*/
		plan_eff_year = /*redacted*/;			/*year that the plan started*/

		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 					/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;					/*all plans are now pension plans*/
		
	run;

	/**************************************************************************
	* Step 2: Identify employers with at least 1 DB plan 
	* in a filing year
	***************************************************************************/
	
	proc sort data = pens&yy;
		by opr_ein;
	run;
	
	proc means noprint data = pens&yy;
		by opr_ein;
		var DB;				
		output out = plan_count&yy
			sum(DB) = num_DB_plans;
	run;
	
	/*Check how many participants are in firms with 2+ DB plans*/
	proc sql;
		create table pens&yy._plncnt as 
		select *
		from pens&yy as t,
		plan_count&yy as m
		where t.opr_ein = m.opr_ein;
	quit;
		
	data blue.pensplan_samp&yy (drop =
		xDB
		soleplan
		_type_
		_freq_);
		set pens&yy._plncnt;
		adj_DB_partcp = DB*adj_active;
		DB_partcp = DB*tot_active_partcp_cnt;
	run;
	
	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;

%mend pullplans_1014;

